Setup

suppressPackageStartupMessages({
  ## Common
  library(tidyverse)
  library(magrittr)
  library(future.apply)
  library(here)
  library(AnnotationHub)
  library(purrr)
  library(scales)
  library(kableExtra)
  library(tictoc)
  library(ggrepel)
  library(RColorBrewer)
  library(ggpubr)
  library(pander)
  library(rmarkdown)
  ## Project specific
  library(UpSetR)
  library(SeqArray)
  library(ngsReports)
})
if (interactive()) setwd(here::here())
theme_set(theme_bw())
cores <- availableCores() - 1
source("~/bioinformatics/bioToolkit/lbFuncs.R")

EnsDb

ah <- AnnotationHub() %>%
  subset(species == "Danio rerio") %>%
  subset(rdataclass == "EnsDb")
ensDb <- ah[["AH83189"]] ## Ens101
genes <- genes(ensDb)
mcols(genes) <- mcols(genes)[
  c("gene_id", "gene_name", "gene_biotype", "entrezid")
]
exons <- exonsBy(ensDb, by = "gene")

An EnsDb object for Ensemble release 101 was setup for extracting gene and exon annotation information.

Metadata

metadata <- read_csv(here("files/SraRunTable.txt")) %>%
  as.data.frame() %>%
  dplyr::arrange(Run) %>%
  mutate(
    Genotype = factor(Genotype, levels = unique(Genotype)),
    Run = factor(Run, levels = Run), 
    alias = c(
      paste0(rep("WT", 8), seq(1, 8)),
      paste0(rep("V1482Afs", 6), seq(1, 6)),
      paste0(rep("R122Pfs", 4), seq(1, 4)),
      paste0(rep("Trans", 6), seq(1, 6))
    )
  ) %>%
  dplyr::select(
    sample = Run, genotype = Genotype, alias, gender, tank = Tank
  )
metadata %>%
  dplyr::rename(
    Sample = sample, Genotype = genotype, Alias = alias, Gender = gender,
    Tank = tank
  ) %>%
  kable(
    align = "l",
    caption = "Sample metadata"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Sample metadata
Sample Genotype Alias Gender Tank
SRR11951228 wild-type WT1 female 1
SRR11951229 wild-type WT2 female 3
SRR11951230 wild-type WT3 male 2
SRR11951231 wild-type WT4 male 3
SRR11951232 wild-type WT5 male 1
SRR11951233 wild-type WT6 female 3
SRR11951234 wild-type WT7 male 2
SRR11951235 wild-type WT8 female 1
SRR11951236 sorl1V1482Afs/+ V1482Afs1 male 1
SRR11951237 sorl1V1482Afs/+ V1482Afs2 male 2
SRR11951238 sorl1V1482Afs/+ V1482Afs3 male 3
SRR11951239 sorl1V1482Afs/+ V1482Afs4 female 1
SRR11951240 sorl1V1482Afs/+ V1482Afs5 female 2
SRR11951241 sorl1V1482Afs/+ V1482Afs6 female 3
SRR11951242 sorl1R122Pfs/+ R122Pfs1 male 2
SRR11951243 sorl1R122Pfs/+ R122Pfs2 male 3
SRR11951244 sorl1R122Pfs/+ R122Pfs3 female 1
SRR11951245 sorl1R122Pfs/+ R122Pfs4 female 1
SRR11951246 sorl1V1482Afs/R122Pfs Trans1 female 1
SRR11951247 sorl1V1482Afs/R122Pfs Trans2 female 2
SRR11951248 sorl1V1482Afs/R122Pfs Trans3 female 3
SRR11951249 sorl1V1482Afs/R122Pfs Trans4 male 2
SRR11951250 sorl1V1482Afs/R122Pfs Trans5 male 1
SRR11951251 sorl1V1482Afs/R122Pfs Trans6 male 3
genoCols <- metadata$genotype %>%
  levels() %>%
  length() %>%
  brewer.pal("Set1") %>%
  setNames(levels(metadata$genotype))
drChrs <- seq(1:25)

Pre-processing

Raw data

rawFqc <- list.files(
  path = here("00_rawData/FastQC/"),
  pattern = "zip",
  full.names = TRUE
) %>%
  FastqcDataList()

Library sizes

Library Sizes for the raw, unprocessed dataset ranged between 30,648,602 and 37,751,699 reads.

plotReadTotals(rawFqc)

GC content

plotly::ggplotly({
  plotGcContent(
    x = rawFqc, 
    plotType = "line",
    gcType = "Transcriptome",
    species = "Drerio"
  ) +
    theme(legend.position = "none")
})

GC content distributions for all samples. Hover for more details.

Trimmed data

Trimming software was setup carefully with the intention to perform downstream detection of short variants following the GATK workflow. Part of this process involves the removal of duplicate reads which relies on detection of duplicates by comparing sequences in the 5’ positions. Trimming can therefore be detrimental to downstream analysis. As such, the trimming procedure was performed more so as a filtering step, where reads were discarded if:

  1. 40% of the bases did not meet the threshold of a 20 phred quality score.
  2. reads were shorter than 35 base pairs in length.
trimFqc <- list.files(
  path = here("01_trim/FastQC/"),
  pattern = "zip",
  full.names = TRUE
) %>%
  FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
  dplyr::rename(Raw = Total_Sequences) %>%
  left_join(readTotals(trimFqc), by = "Filename") %>%
  dplyr::rename(Trimmed = Total_Sequences) %>%
  mutate(
    Discarded = 1 - Trimmed/Raw,
    Retained = Trimmed / Raw
  )

After trimming between 0.48% and 0.83% of reads were discarded.

Aligned data

Trimmed reads were aligned to GRCz11 reference genome (Ensembl release 101) with STAR 2.7.7a. STAR’s two-pass mode was chosen to achieve better alignments around novel splice junctions.

GATK short variant discovery

Duplicates

Marking duplicates is important because almost all statistical models for variant calling assume some sort of independence between measurements. Duplicates that arise from PCR artifacts are not independent. Picard MarkDuplicates tool classifies duplicates by comparing sequences in the 5 prime position of reads and read pairs in a BAM file. This is indeed a conservative approach as there remains the possibility of marking non-technical duplicates that appear to be duplicates by chance.

Duplicates were identified with Picard MarkDuplicates and the representative read was chosen by random, as opposed to a base quality scoring strategy, such that reference allele mapping bias was avoided for downstream Allele Specific Expression (ASE) testing.

dupeMetrics <- list.files(here("03_markDuplicates/log/"), full.names = TRUE) %>%
  lapply(function(x){
    sample <- basename(x) %>%
      str_remove(".tsv")
    read_tsv(x, comment = "#") %>%
      mutate(sample = sample)
  }) %>%
  bind_rows() %>%
  dplyr::select(
    sample, 
    reads = UNPAIRED_READS_EXAMINED, 
    secondary = SECONDARY_OR_SUPPLEMENTARY_RDS,
    percent.duplication = PERCENT_DUPLICATION
  )

Between 63.54% and 79.44% of reads were marked as duplicates across all samples. These large proportions are likely due to the conservative nature of the duplicate detection strategy. Additionally, this dataset consists of single-end reads, and therefore the software can only classify dulplicates based on a single sequence.

Genomic location

Variants were restricted to those that lie only in exonic regions of the Ensembl version 101 primary assembly. For zebrafish this consists of chromosomes 1 to 25. To perform this inside of GATK’s variant calling HaploypeCaller command, an interval list was created containing the exonic ranges. A number of interval list formats are supported by GATK as described here. The GATK-style .intervals is poorly described but requires the format <chr>:<start>-<end> e.g. 1:1367-1367 for a singular base position. The nomenclature must match that of the chosen reference, for example, Ensembl labels chromosome 1 as “1”, not “chr1”. The GATK-style format was chosen for its simplicity.

Defining intervals before calling variants also has the advantage of a large speed-up in computational processing time. HaplotypeCaller is the longest step in the GATK short variant discovery pipeline, sometimes taking over a day to process for large datasets on a HPC system when interval lists are not utilised.

exonRanges <- exons %>%
  unlist() %>%
  GenomicRanges::reduce() %>%
  as.data.frame() %>%
  dplyr::filter(seqnames %in% drChrs) %>%
  dplyr::select(chromosome = seqnames, start, end)
intervalPath <- "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/intervals/exons.intervals"
makeIntervals <- !file.exists(intervalPath)
if (makeIntervals) {
  dir.create(dirname(intervalPath), recursive = TRUE)
  tibble(
    interval = paste0(
      exonRanges$chromosome,
      ":",
      exonRanges$start,
      "-",
      exonRanges$end
    )
  ) %>%
    write.table(
      file = intervalPath,
      col.names = FALSE,
      row.names = FALSE,
      quote = FALSE
    )
}

Variant calling

calledMetrics <- list.files(
  # path = here("08_callSnvs/called/log/"),
  path = "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/called/log/",
  pattern = "detail",
  full.names = TRUE
) %>%
  lapply(read_tsv, comment = "#") %>%
  bind_rows() %>%
  dplyr::select(-contains(c("DBSNP", "DB_SNP", "NOVEL"))) %>%
  as.data.frame()
filteredMetrics <- list.files(
  # path = here("08_callSnvs/filtered/log/"),
  path = "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/filtered/log/",
  pattern = "detail",
  full.names = TRUE
) %>%
  lapply(read_tsv, comment = "#") %>%
  bind_rows() %>%
  dplyr::select(-contains(c("DBSNP", "DB_SNP", "NOVEL"))) %>%
  as.data.frame()
selectedMetrics <- list.files(
  # path = here("08_callSnvs/selected/log/"),
  path = "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/selected/log/",
  pattern = "detail",
  full.names = TRUE
) %>%
  lapply(read_tsv, comment = "#") %>%
  bind_rows() %>%
  dplyr::select(-contains(c("DBSNP", "DB_SNP", "NOVEL"))) %>%
  as.data.frame()
filtProgress <- calledMetrics %>%
  as_tibble() %>%
  dplyr::select(sample = SAMPLE_ALIAS, initial = TOTAL_SNPS)
filtProgress <- selectedMetrics %>%
  as_tibble() %>%
  dplyr::select(sample = SAMPLE_ALIAS, biallelic = TOTAL_SNPS) %>%
  left_join(filtProgress)

GATK’s initial variant calling output provides a number of different types of variants, for example indels, single nucleotide and multi nucleotide variants. The total number of all types of variants ranges between 104620 and 206524.

It is recommended that only single nucleotide variants (SNVs) are used for ASE analysis. Furthermore, only biallelic SNVs are informative for estimating allelic proportions in terms of ASE. A filter was applied such that only biallelic SNVs remained for ASE analysis. The number of biallelic SNVs ready for ASE analysis ranged between 80543 and 160622.

Allele Specific Expression

There are a number of important quality control measures that must be taken to ensure reliable data for allele specific expression (ASE) testing. The following procedures are based around the guidelines described in Castel et al. Tools and Best Practices for allelic expression analysis.

To gather SNV information as determined by the GATK workflow, VCF files were converted into GDS objects. GDS objects allow for easy access of the data in R, but are large so were kept on the HPC system.

vcfPaths <- list.files(
  "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/selected",
  pattern = ".vcf.gz$",
  full.names = TRUE
)
lapply(vcfPaths, function(vcf){
  gdsPath <- paste0(
    str_remove(dirname(vcf), "selected"),
    "gds/",
    str_remove(basename(vcf), ".vcf.gz"),
    ".gds"
  )
  if (!file.exists(gdsPath)) {
    if (!dir.exists(dirname(gdsPath))) {
      dir.create(dirname(gdsPath), recursive = TRUE)
    }
    seqVCF2GDS(vcf, gdsPath)
  }
})
gdsPaths <- list.files(
  "/hpcfs/users/a1647910/210216_sorl1_snv/08_callSnvs/gds",
  pattern = ".gds$",
  full.names = TRUE
)
snvList <- lapply(seq_along(gdsPaths), function(x){
  gdsPath <- gdsPaths[x]
  sample <- basename(gdsPath) %>%
    str_remove(".gds")
  gds <- seqOpen(gdsPath, readonly = FALSE)
  snvs <- tibble(
    variant.id = seqGetData(gds, "variant.id"),
    chromosome = seqGetData(gds, "chromosome"),
    position = seqGetData(gds, "position"),
    allele = seqGetData(gds, "allele"),
    filter = seqGetData(gds, "annotation/filter")
  ) %>%
    # dplyr::filter(
    #   filter == "PASS"
    # ) %>%
    mutate(
      sample = sample,
      refAllele = str_split(allele, ",", simplify = TRUE)[,1],
      altAllele = str_split(allele, ",", simplify = TRUE)[,2]
    )
  seqClose(gds)
  return(snvs)
})

WASP

An important aspect to consider in ASE analysis is the potential for read mapping bias. For RNA-seq data mapped to a reference genome, reads that carry an alternate allele at positions of variation have at least one mismatch, and therefore a lower probability of aligning correctly than reads containing the reference allele.

WASP provides a suite of tools for unbiased allele-specific read mapping. The WASP workflow for mappability filtering is defined by 5 steps:

  1. Reads are mapped normally with the user’s mapper of choice (STAR was chosen in this case).
  2. Reads that overlap identified SNVs and therefore may have mapping bias are determined using find_intersecting_snps.py script. Each overlapping read is output as a set of synthetic reads in FASTQ format with its allele swapped to to the other allele at the SNV site.
  3. The set of allele-swapped reads are re-mapped using the same method and options used in step 1.
  4. Reads where one or more allelic versions fail to map back to the same location are removed using filter_remapped_reads.py script.
  5. Reads that did not overlap SNVs are merged with those that show unbiased mapping from step 4 using samtools, resulting in a complete set of mappability-filtered aligned reads in BAM format.

To begin the WASP fltering procedure, input files containing genetic variants (SNVs) previously identified by the GATK workflow are needed. Since we do not have phasing information, it is recommended to use text-based format opposed to HDF5 for the input files. The text-based input files require three space-delimited columns (position, ref_allele, alt_allele), and one input file per chromosome. The filenames must follow the convention <chr>.snps.txt.gz (e.g 1.snps.txt.gz for chromosome 1).

lapply(snvList, function(x){
  chrList <- x %>%
    dplyr::select(chromosome, position, refAllele, altAllele, sample) %>%
    split(.[,'chromosome'])
  lapply(chrList, function(x){
    chr <- unique(x$chromosome)
    sample <- unique(x$sample)
    path <- paste0(
      "/hpcfs/users/a1647910/210216_sorl1_snv/09_wasp/snvs/",
      sample,
      "/",
      chr,
      ".snps.txt.gz"
    )
    if (!file.exists(path)) {
      if (!dir.exists(dirname(path))) {
        dir.create(dirname(path), recursive = TRUE)
      }
      tibble(
        position = x$position,
        refAllele = x$refAllele,
        altAllele = x$altAllele
      ) %>%
        write_delim(file = path, delim = " ", col_names = FALSE)
    }
  })
})

Text-based input files were created for WASP filtering, which was subsequently performed on the HPC system. The resulting BAM files were then subject to allele specific expression read counting with ASEReadCounter.

ASEReadCounter

GATK tool ASEReadCounter calculates allele counts at a set of positions after applying filters specifically tuned for ASE analysis of RNA-seq data. The filters operate on mapping quality, base quality, depth of coverage and overlapping paired reads. Each of these filters can be controlled by command-line arguments. The data in this analysis was generated with the following options:

  • --min-mapping-quality 10
  • --min-base-quality 20
  • --count-overlap-reads-handling COUNT_FRAGMENTS_REQUIRE_SAME_BASE
  • Ths option counts all fragments where the base is consistent when mate pairs overlap.
  • --min-depth-of-non-filtered-base -1
  • This argument is disabled, because depth filtering was performed manually (see next section: Coverage).

ASEReadCounter was run on BAM alignment files that were subject to WASP filtering and also alignment files that had not been WASP filtered, to compare whether the filtering procedure improves any observable reference allele mapping bias. Heterozygous SNVs detected by ASEReadCounter were recorded in separate tsv files for each sample. These were loaded into R as a list of tibbles containing allele count data for ASE analysis.

files_nowasp <- list.files(
  "/hpcfs/users/a1647910/210216_sorl1_snv/10_aseReadCounter/nowasp",
  full.names = TRUE
)

samples <- basename(files_nowasp) %>%
  str_remove(".tsv")
aseRC_nowasp <- lapply(files_nowasp, function(file){
  sample <- basename(file) %>%
    str_remove(".tsv")
  read_tsv(
    file,
    col_types = "cdcccdddddddd"
  ) %>%
    mutate(
      sample = sample,
      allele = paste0(refAllele, ",", altAllele)
    ) %>%
    left_join(metadata[,c("sample", "genotype")]) %>%
    dplyr::select(
      chromosome = contig, position, allele, refCount, altCount, totalCount,
      sample, genotype, lowMAPQDepth, lowBaseQDepth, rawDepth, otherBases,
      improperPairs, refAllele, altAllele
    )
}) %>% 
  set_names(samples)
files_wasp <- list.files(
  "/hpcfs/users/a1647910/210216_sorl1_snv/10_aseReadCounter/wasp",
  full.names = TRUE
)
aseRC_wasp <- lapply(files_wasp, function(file){
  sample <- basename(file) %>%
    str_remove(".tsv")
  read_tsv(
    file,
    col_types = "cdcccdddddddd"
  ) %>%
    mutate(
      sample = sample,
      allele = paste0(refAllele, ",", altAllele)
    ) %>%
    left_join(metadata[,c("sample", "genotype")]) %>%
    dplyr::select(
      chromosome = contig, position, allele, refCount, altCount, totalCount,
      sample, genotype, lowMAPQDepth, lowBaseQDepth, rawDepth, otherBases,
      improperPairs, refAllele, altAllele
    )
}) %>% 
  set_names(samples)
filtProgress <- list(
  nowasp = aseRC_nowasp %>%
    bind_rows() %>%
    group_by(sample) %>%
    summarise(
      aseRC = n()
    ) %>%
    left_join(filtProgress),
  wasp = aseRC_wasp %>%
    bind_rows() %>%
    group_by(sample) %>%
    summarise(
      software = n()
    ) %>%
    left_join(filtProgress)
)

Coverage

SNV sites with low read coverage are uninformative, and therefore were subject to filtering. To determine the optimal read coverage value for filtering, the effects of potential cut-offs were inspected.

covBins_nowasp <- lapply(aseRC_nowasp, function(x){
  n0 <- nrow(dplyr::filter(x, totalCount >= 0))
  n10 <- nrow(dplyr::filter(x, totalCount >= 10))
  n20 <- nrow(dplyr::filter(x, totalCount >= 20))
  n30 <- nrow(dplyr::filter(x, totalCount >= 30))
  n40 <- nrow(dplyr::filter(x, totalCount >= 40))
  n50 <- nrow(dplyr::filter(x, totalCount >= 50))
  sample <- unique(x$sample)
  tibble(
    bin = c("\u2265 0", "\u2265 10", "\u2265 20", "\u2265 30", "\u2265 40", "\u2265 50"),
    n = c(n0, n10, n20, n30, n40, n50),
    sample = sample,
    wasp = "Without WASP filtering"
  )
}) %>%
  bind_rows()
covBins_wasp <- lapply(aseRC_wasp, function(x){
  n0 <- nrow(dplyr::filter(x, totalCount >= 0))
  n10 <- nrow(dplyr::filter(x, totalCount >= 10))
  n20 <- nrow(dplyr::filter(x, totalCount >= 20))
  n30 <- nrow(dplyr::filter(x, totalCount >= 30))
  n40 <- nrow(dplyr::filter(x, totalCount >= 40))
  n50 <- nrow(dplyr::filter(x, totalCount >= 50))
  sample <- unique(x$sample)
  tibble(
    bin = c("\u2265 0", "\u2265 10", "\u2265 20", "\u2265 30", "\u2265 40", "\u2265 50"),
    n = c(n0, n10, n20, n30, n40, n50),
    sample = sample,
    wasp = "With WASP filtering"
  )
}) %>%
  bind_rows()
bind_rows(covBins_nowasp, covBins_wasp) %>%
  mutate(wasp = factor(wasp, levels = c(
    "Without WASP filtering", "With WASP filtering"
  ))) %>%
  ggplot(aes(bin, n, fill = bin)) +
  geom_boxplot() +
  labs(x = "Reads / SNV", y = "Number of SNVs") +
  theme(legend.position = "none") +
  facet_wrap(~wasp)
*The number of SNVs per sample that satisfy read coverage cut-offs*

The number of SNVs per sample that satisfy read coverage cut-offs

A cut-off of at least 10 counts per SNV position was applied, as it provides a reasonable estimate of ASE while keeping as many SNVs as possible.

aseRC_wasp <- lapply(aseRC_wasp, function(x){
  dplyr::filter(x, totalCount >= 10)
})
aseRC_nowasp <- lapply(aseRC_nowasp, function(x){
  dplyr::filter(x, totalCount >= 10)
})
filtProgress <- list(
  nowasp = aseRC_nowasp %>%
    bind_rows() %>%
    group_by(sample) %>%
    summarise(
      coverage = n()
    ) %>%
    left_join(filtProgress$nowasp),
  wasp = aseRC_wasp %>%
    bind_rows() %>%
    group_by(sample) %>%
    summarise(
      coverage = n()
    ) %>%
    left_join(filtProgress$wasp)
)

Mono-allelic expression

Producing evidence of mono-allelic expression from short read RNA-seq datasets without parental genotypes or imprinting information is a controversial issue. SNV sites were therefore filtered based on a depth criteria for each allele (\(\ge\) 3 counts or \(>\) 1% of the total counts for that allele).

aseRC_wasp <- lapply(aseRC_wasp, function(x){
  dplyr::filter(
    x,
    refCount >= 3,
    altCount >= 3,
    refCount / totalCount > 0.01,
    altCount / totalCount > 0.01
  )
})
aseRC_nowasp <- lapply(aseRC_nowasp, function(x){
  dplyr::filter(
    x,
    refCount >= 3,
    altCount >= 3,
    refCount / totalCount > 0.01,
    altCount / totalCount > 0.01
  )
})
filtProgress <- list(
  nowasp = aseRC_nowasp %>%
    bind_rows() %>%
    group_by(sample) %>%
    summarise(
      monoallelic = n()
    ) %>%
    left_join(filtProgress$nowasp),
  wasp = aseRC_wasp %>%
    bind_rows() %>%
    group_by(sample) %>%
    summarise(
      monoallelic = n()
    ) %>%
    left_join(filtProgress$wasp)
)

Summary

Reference mapping bias

WASP filtering is designed to remove the vast majority of reads with mapping bias. Reference allele ratios were calculated for all SNVs passing filtering procedures with and without WASP filtering, to determine the effectiveness of WASP. A residual mapping bias is indicated by a large deviation in reference allele ratios from 0.5.

refRatios_nowasp <- lapply(aseRC_nowasp, function(x){
  x %>%
    dplyr::filter(totalCount != 0) %>%
    mutate(
      refRatio = refCount / totalCount,
      altRatio = altCount / totalCount
    ) %>%
    dplyr::select(
      chromosome, position, allele, refRatio, altRatio, totalCount,
      sample, genotype
    )
}) %>%
  bind_rows()
refRatios_wasp <- lapply(aseRC_wasp, function(x){
  x %>%
    dplyr::filter(totalCount != 0) %>%
    mutate(
      refRatio = refCount / totalCount,
      altRatio = altCount / totalCount
    ) %>%
    dplyr::select(
      chromosome, position, allele, refRatio, altRatio, totalCount,
      sample, genotype
    )
}) %>%
  bind_rows()

Distributions of the reference allele ratio for all SNVs were visualised to determine if any mapping bias was present. Distributions showing balanced profile without 0 or 1 inflation are representative of no mapping bias.

refRatios_nowasp %>%
  left_join(metadata[,c("sample", "genotype")]) %>%
  dplyr::arrange(genotype, sample) %>%
  ggplot(aes(refRatio, group = sample, colour = genotype)) +
  geom_density() +
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  labs(x = "Reference allele ratio", y = "Density", colour = "Genotype") +
  theme(legend.position = "bottom") +
  scale_color_manual(values = genoCols) +
  ggtitle("Reference allele ratio distributions without WASP filtering")
*Sample distributions of global reference allelic ratios for remaining SNVs when no WASP filtering was applied. Distributions are coloured by genotype. All samples tend to follow a similar distribution with inflations towards 1, indicating that a reference allele mapping bias is present.*

Sample distributions of global reference allelic ratios for remaining SNVs when no WASP filtering was applied. Distributions are coloured by genotype. All samples tend to follow a similar distribution with inflations towards 1, indicating that a reference allele mapping bias is present.

refRatios_wasp %>%
  left_join(metadata[,c("sample", "genotype")]) %>%
  dplyr::arrange(genotype, sample) %>%
  ggplot(aes(refRatio, group = sample, colour = genotype)) +
  geom_density() +
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  labs(x = "Reference allele ratio", y = "Density", colour = "Genotype") +
  theme(legend.position = "bottom") +
  scale_color_manual(values = genoCols) +
  ggtitle("Reference allele ratio distributions with WASP filtering")
*Sample distributions of global reference allelic ratios for remaining SNVs when WASP filtering was incorporated into the quality control procedure. Distributions are coloured by genotype. All samples show a reasonably balanced profile, indicating that WASP filtering successfully improved reference mapping bias.*

Sample distributions of global reference allelic ratios for remaining SNVs when WASP filtering was incorporated into the quality control procedure. Distributions are coloured by genotype. All samples show a reasonably balanced profile, indicating that WASP filtering successfully improved reference mapping bias.

As an alternative viewpoint, boxplots of reference allele ratios were plotted for each sample. Median/mean reference allele ratios that deviate substantially from 0.5 indicate a mapping bias.

refRatios_nowasp %>%
  left_join(metadata[,c("sample", "genotype")]) %>%
  dplyr::arrange(genotype, sample) %>%
  mutate(sample = factor(sample, levels = unique(sample))) %>%
  ggplot(aes(sample, refRatio, fill = genotype)) +
  geom_boxplot(fatten = 3, outlier.shape = NA) +
  geom_hline(yintercept = 0.5, colour = "black") +
  stat_summary(
    fun = mean, geom = "point", colour = "white", shape = "\U2012", size = 6
  ) +
  labs(x = "Sample", y = "Reference allele ratio") +
  scale_fill_manual(name = "Genotype", values = genoCols) +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  theme(
    axis.text.x = element_text(angle = -90, vjust = 0.5),
    legend.position = "bottom"
  ) +
  ggtitle(label = "Reference allele ratios without WASP filtering")
*Boxplots showing reference allele ratios for remaining SNVs when no WASP filtering was applied. The mean reference ratio for each sample is indicated with a white dash, while the solid black line indicates a reference ratio of 0.5. Outliers are hidden for the ease of viewing. As indicated by mean and median reference ratios above 0.5, reference mapping bias is evident.*

Boxplots showing reference allele ratios for remaining SNVs when no WASP filtering was applied. The mean reference ratio for each sample is indicated with a white dash, while the solid black line indicates a reference ratio of 0.5. Outliers are hidden for the ease of viewing. As indicated by mean and median reference ratios above 0.5, reference mapping bias is evident.

refRatios_wasp %>%
  dplyr::filter(totalCount >= 10) %>%
  left_join(metadata[,c("sample", "genotype")]) %>%
  dplyr::arrange(genotype, sample) %>%
  mutate(sample = factor(sample, levels = unique(sample))) %>%
  ggplot(aes(sample, refRatio, fill = genotype)) +
  geom_boxplot(fatten = 3, outlier.shape = NA) +
  geom_hline(yintercept = 0.5, colour = "black") +
  stat_summary(
    fun = mean, geom = "point", colour = "white", shape = "\U2012", size = 6
  ) +
  labs(x = "Sample", y = "Reference allele ratio") +
  scale_fill_manual(name = "Genotype", values = genoCols) +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  guides(colour = FALSE) +
  theme(
    axis.text.x = element_text(angle = -90, vjust = 0.5),
    legend.position = "bottom"
  ) +
  ggtitle(label = "Reference allele ratios with WASP filtering")
*Boxplots showing reference allele ratios for remaining SNVs when WASP filtering was incorporated into the quality control procedure. The mean reference ratio for each sample is indicated with a white dash, while the solid black line indicates a reference ratio of 0.5. Outliers are hidden for the ease of viewing. Residual reference mapping bias remains in the data, however there appears to be an improvement over data that was not subject to WASP filtering.*

Boxplots showing reference allele ratios for remaining SNVs when WASP filtering was incorporated into the quality control procedure. The mean reference ratio for each sample is indicated with a white dash, while the solid black line indicates a reference ratio of 0.5. Outliers are hidden for the ease of viewing. Residual reference mapping bias remains in the data, however there appears to be an improvement over data that was not subject to WASP filtering.

Whilst not perfect, WASP filtering improved the reference allele mapping bias and therefore was WASP filtered data chosen for ASE testing.

aseRC <- aseRC_wasp
refRatios <- refRatios_wasp
filtProgress <- filtProgress$wasp
refBias <- refRatios %>%
  bind_rows() %>%
  group_by(sample) %>%
  summarise(mean = mean(refRatio), median = median(refRatio))
altBias <- refRatios %>%
  bind_rows() %>%
  group_by(sample) %>%
  summarise(mean = mean(altRatio), median = median(altRatio))

After quality control procedures, mean reference ratios ranged between 0.514 and 0.519. Median reference ratios ranged between 0.5 and 0.515. The mean reference ratios for each sample will be used as the null in statistical testing for ASE, as this can improve results (Castel et al.).

Filtering

filtProgress %>%
  dplyr::select(
    sample, None = initial, `Biallelic SNVs` = biallelic,
    `WASP/ASEReadCounter` = software,
    Coverage = coverage, `Mono-allelic expression` = monoallelic
  ) %>%
  pivot_longer(cols = -sample, names_to = "Filter", values_to = "SNVs") %>%
  mutate(Filter = factor(Filter, levels = unique(.$Filter))) %>%
  ggplot(aes(Filter, SNVs, group = Filter, fill = Filter)) +
  geom_boxplot() +
  labs(y = "Number of SNVs") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank()
  )
*Boxplots of the total number of SNVs remaining in each sample after various quality control filtering steps. The filters were applied sequentially from left to right across the x-axis, and as such the remaining number of SNVs follows a decreasing trend.*

Boxplots of the total number of SNVs remaining in each sample after various quality control filtering steps. The filters were applied sequentially from left to right across the x-axis, and as such the remaining number of SNVs follows a decreasing trend.

Coverage

covPlotA <- lapply(aseRC, function(x){
  n10 <- nrow(dplyr::filter(x, totalCount >= 10))
  n20 <- nrow(dplyr::filter(x, totalCount >= 20))
  n30 <- nrow(dplyr::filter(x, totalCount >= 30))
  n40 <- nrow(dplyr::filter(x, totalCount >= 40))
  n50 <- nrow(dplyr::filter(x, totalCount >= 50))
  sample <- unique(x$sample)
  tibble(
    bin = c("\u2265 10", "\u2265 20", "\u2265 30", "\u2265 40", "\u2265 50"),
    n = c(n10, n20, n30, n40, n50),
    sample = sample
  )
}) %>%
  bind_rows() %>%
  ggplot(aes(bin, n, fill = bin)) +
  geom_boxplot() +
  labs(x = "Reads / SNV", y = "SNVs per sample") +
  scale_y_continuous(breaks = seq(0, 100000, 10000)) +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank()
  )
exonOverlaps <- lapply(aseRC, function(x){
  indices <- x %>%
    dplyr::select(seqnames = chromosome, start = position) %>%
    mutate(end = start) %>%
    makeGRangesFromDataFrame() %>%
    findOverlaps(exons) %>%
    as.data.frame()
  tibble(
    snvInd = indices$queryHits,
    geneInd = indices$subjectHits
  )
})
aseRCByGene <- lapply(seq_along(aseRC), function(x){
  aseRC[[x]][exonOverlaps[[x]]$snvInd,] %>%
    cbind(gene = names(exons[exonOverlaps[[x]]$geneInd])) %>%
    as_tibble()
})
covPlotB <- lapply(aseRCByGene, function(x){
  n10 <- length(unique(pull(dplyr::filter(x, totalCount >= 10), gene)))
  n20 <- length(unique(pull(dplyr::filter(x, totalCount >= 20), gene)))
  n30 <- length(unique(pull(dplyr::filter(x, totalCount >= 30), gene)))
  n40 <- length(unique(pull(dplyr::filter(x, totalCount >= 40), gene)))
  n50 <- length(unique(pull(dplyr::filter(x, totalCount >= 50), gene)))
  sample <- unique(x$sample)
  tibble(
    bin = c("\u2265 10", "\u2265 20", "\u2265 30", "\u2265 40", "\u2265 50"),
    n = c(n10, n20, n30, n40, n50),
    sample = sample
  )
}) %>%
  bind_rows() %>%
  ggplot(aes(bin, n, fill = bin)) +
  geom_boxplot() +
  labs(x = "Reads / SNV", y = "Gene count") +
  scale_y_continuous(breaks = seq(0, 10000, 1000)) +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank()
  )
covPlotC <- lapply(aseRCByGene, function(x){
  x %>%
    dplyr::select(chromosome, position, totalCount, sample, gene) %>%
    mutate(
      n10 = totalCount >= 10,
      n20 = totalCount >= 20,
      n30 = totalCount >= 30,
      n40 = totalCount >= 40,
      n50 = totalCount >= 50
    ) %>%
    group_by(gene) %>%
    summarise(
      n10 = sum(n10),
      n20 = sum(n20),
      n30 = sum(n30),
      n40 = sum(n40), 
      n50 = sum(n50)
    ) %>%
    ungroup() %>%
    pivot_longer(
      cols = c("n10", "n20", "n30", "n40", "n50"),
      names_to = "bin",
      values_to = "snvs"
    ) %>%
    group_by(bin, snvs) %>%
    summarise(genes = n()) %>%
    ungroup() %>%
    dplyr::filter(snvs != 0)
}) %>%
  bind_rows() %>%
  group_by(bin, snvs) %>%
  summarise(medianGenes = median(genes)) %>%
  ggplot(aes(snvs, medianGenes, colour = bin)) +
  geom_point() +
  geom_line() +
  scale_colour_discrete(
    labels = c("\u2265 10", "\u2265 20", "\u2265 30", "\u2265 40", "\u2265 50")
  ) +
  scale_x_continuous(minor_breaks = seq(0, 30, 1)) +
  scale_y_continuous(breaks = seq(0, 5000, 100)) +
  coord_cartesian(xlim = c(0, 25)) +
  labs(x = "SNVs / gene", y = "Median gene count", colour = "Coverage") +
  theme(legend.position= c(0.75, 0.75))
ggarrange(covPlotA, covPlotB, covPlotC, nrow = 1, labels = "AUTO")
*Genomic coverage of ASE data. **A**, **B** The number of SNVs (**A**) and protein coding genes (**B**) per sample at different read coverage cut-offs. **C** The number of protein coding genes against the number of SNVs they contain at different read coverage cut-offs. Each line represents the median value for all samples.*

Genomic coverage of ASE data. A, B The number of SNVs (A) and protein coding genes (B) per sample at different read coverage cut-offs. C The number of protein coding genes against the number of SNVs they contain at different read coverage cut-offs. Each line represents the median value for all samples.

Export

For static ASE testing, which describes the difference between two variants of a heterozygous allele in a single sample in an unchanging condition, geneiASE software was chosen. geneiASE requires allele counts tsv format with four columns: featureID, snpID, alternative allele count, reference allele count. tsv files (one for each sample) were exported for ASE testing.

staticCounts <- lapply(aseRCByGene, function(x){
  x %>%
    rownames_to_column("snvID") %>%
    left_join(altBias[,c("sample", "mean")]) %>%
    dplyr::select(
      gene, snvID, altCount, refCount, betabinom.p = mean
    )
}) %>%
  set_names(samples)
lapply(seq_along(staticCounts), function(x){
  path <- paste0(
    "/hpcfs/users/a1647910/210216_sorl1_snv/11_geneiase/counts/",
    names(staticCounts[x]),
    ".static.tsv"
  )
  if (!file.exists(path)) {
    if (!dir.exists(dirname(path))) {
      dir.create(dirname(path), recursive = TRUE)
    }
    write_tsv(staticCounts[[x]], path)
  }
})
Session information
sessionInfo() %>%
  pander()

R version 4.1.0 (2021-05-18)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: ensembldb(v.2.16.2), AnnotationFilter(v.1.16.0), GenomicFeatures(v.1.44.0), AnnotationDbi(v.1.54.1), Biobase(v.2.52.0), GenomicRanges(v.1.44.0), GenomeInfoDb(v.1.28.1), IRanges(v.2.26.0), S4Vectors(v.0.30.0), ngsReports(v.1.8.1), SeqArray(v.1.32.0), gdsfmt(v.1.28.0), UpSetR(v.1.4.0), rmarkdown(v.2.9), pander(v.0.6.4), ggpubr(v.0.4.0), RColorBrewer(v.1.1-2), ggrepel(v.0.9.1), tictoc(v.1.0.1), kableExtra(v.1.3.4), scales(v.1.1.1), AnnotationHub(v.3.0.1), BiocFileCache(v.2.0.0), dbplyr(v.2.1.1), BiocGenerics(v.0.38.0), here(v.1.0.1), future.apply(v.1.7.0), future(v.1.21.0), magrittr(v.2.0.1), forcats(v.0.5.1), stringr(v.1.4.0), dplyr(v.1.0.7), purrr(v.0.3.4), readr(v.1.4.0), tidyr(v.1.1.3), tibble(v.3.1.2), ggplot2(v.3.3.5) and tidyverse(v.1.3.1)

loaded via a namespace (and not attached): readxl(v.1.3.1), backports(v.1.2.1), systemfonts(v.1.0.2), plyr(v.1.8.6), lazyeval(v.0.2.2), crosstalk(v.1.1.1), BiocParallel(v.1.26.1), listenv(v.0.8.0), digest(v.0.6.27), htmltools(v.0.5.1.1), fansi(v.0.5.0), memoise(v.2.0.0), openxlsx(v.4.2.4), globals(v.0.14.0), Biostrings(v.2.60.1), modelr(v.0.1.8), matrixStats(v.0.59.0), svglite(v.2.0.0), prettyunits(v.1.1.1), colorspace(v.2.0-2), blob(v.1.2.1), rvest(v.1.0.0), rappdirs(v.0.3.3), haven(v.2.4.1), xfun(v.0.24), crayon(v.1.4.1), RCurl(v.1.98-1.3), jsonlite(v.1.7.2), zoo(v.1.8-9), glue(v.1.4.2), gtable(v.0.3.0), zlibbioc(v.1.38.0), XVector(v.0.32.0), webshot(v.0.5.2), DelayedArray(v.0.18.0), car(v.3.0-11), abind(v.1.4-5), DBI(v.1.1.1), rstatix(v.0.7.0), Rcpp(v.1.0.7), progress(v.1.2.2), viridisLite(v.0.4.0), xtable(v.1.8-4), foreign(v.0.8-81), bit(v.4.0.4), DT(v.0.18), htmlwidgets(v.1.5.3), httr(v.1.4.2), ellipsis(v.0.3.2), farver(v.2.1.0), XML(v.3.99-0.6), pkgconfig(v.2.0.3), sass(v.0.4.0), utf8(v.1.2.1), labeling(v.0.4.2), tidyselect(v.1.1.1), rlang(v.0.4.11), later(v.1.2.0), munsell(v.0.5.0), BiocVersion(v.3.13.1), cellranger(v.1.1.0), tools(v.4.1.0), cachem(v.1.0.5), cli(v.3.0.0), generics(v.0.1.0), RSQLite(v.2.2.7), broom(v.0.7.8), evaluate(v.0.14), fastmap(v.1.1.0), ggdendro(v.0.1.22), yaml(v.2.2.1), knitr(v.1.33), bit64(v.4.0.5), fs(v.1.5.0), zip(v.2.2.0), KEGGREST(v.1.32.0), mime(v.0.11), xml2(v.1.3.2), biomaRt(v.2.48.2), compiler(v.4.1.0), rstudioapi(v.0.13), plotly(v.4.9.4.1), filelock(v.1.0.2), curl(v.4.3.2), png(v.0.1-7), interactiveDisplayBase(v.1.30.0), ggsignif(v.0.6.2), reprex(v.2.0.0), bslib(v.0.2.5.1), stringi(v.1.6.2), highr(v.0.9), lattice(v.0.20-44), ProtGenerics(v.1.24.0), Matrix(v.1.3-4), vctrs(v.0.3.8), pillar(v.1.6.1), lifecycle(v.1.0.0), BiocManager(v.1.30.16), jquerylib(v.0.1.4), cowplot(v.1.1.1), data.table(v.1.14.0), bitops(v.1.0-7), rtracklayer(v.1.52.0), httpuv(v.1.6.1), BiocIO(v.1.2.0), R6(v.2.5.0), promises(v.1.2.0.1), gridExtra(v.2.3), rio(v.0.5.27), parallelly(v.1.26.1), codetools(v.0.2-18), MASS(v.7.3-54), assertthat(v.0.2.1), SummarizedExperiment(v.1.22.0), rjson(v.0.2.20), rprojroot(v.2.0.2), withr(v.2.4.2), GenomicAlignments(v.1.28.0), Rsamtools(v.2.8.0), GenomeInfoDbData(v.1.2.6), hms(v.1.1.0), grid(v.4.1.0), MatrixGenerics(v.1.4.0), carData(v.3.0-4), shiny(v.1.6.0), lubridate(v.1.7.10) and restfulr(v.0.0.13)